library(tidyverse)
library(lubridate)
library(data.table)
library(stringr)
library(quanteda)
library(tokenizers)
library(stopwords)
library(distrom)
library(ggthemes)
library(stm)
library(gt)
library(Matrix)

# Use the ggthemes "few" theme as the default for every plot in this script.
theme_set(theme_few())

# Project directory layout, built from a path relative to the working directory.
base_dir <- "../"
table_dir <- paste0(base_dir, "tables/")
plot_dir <- paste0(base_dir, "plots/")
data_dir <- paste0(base_dir, "data/")
output_dir <- paste0(base_dir, "output/")

# Thread count for quanteda (hard-coded to 8).
quanteda_options(threads = 8)


# Transcript table; the code below reads its speaker, channel, month,
# word_count and text columns.
transcripts_sp_mo <- readRDS(paste0(output_dir, "transcripts_sp_mo.rds"))

# Add the name components and the "first last" lookup key that is later used
# to match candidates against transcript speaker labels.
#
# dt:        data.table with a character column `name` formatted as
#            "last, first [middle]", optionally with a nickname in parentheses.
# lowercase: if TRUE, lower-case the resulting name_to_look.
#
# Adds name_last, name_first, name_middle and name_to_look to `dt` by
# reference and returns `dt` invisibly.
add_name_to_look <- function(dt, lowercase = TRUE) {
  dt[, name_last := str_extract(name, "(\\w+)(?=,)")]
  dt[, name_first := str_extract(name, "(?<=, )(\\w+)")]
  # a nickname in parentheses replaces the formal first name
  dt[grepl("\\)", name), name_first := str_extract(name, "(?<=\\()(\\w+)(?=\\))")]  # nicknames
  dt[, name_middle := str_extract(name, "(?<=\\w )(\\w+)")]
  # when the first name is a single character (an initial), fall back to the
  # middle name
  dt[, name_to_look := if_else(is.na(name_middle), paste(name_first, name_last),
                          if_else(nchar(name_first) == 1, paste(name_middle, name_last),
                              paste(name_first, name_last)))]
  if (lowercase) {
    dt[, name_to_look := tolower(name_to_look)]
  }
  invisible(dt)
}

# read list of tea party candidates
tpcands <- fread(paste0(base_dir, "data/tea-party-candidates-with-time-varying-affiliation.csv"))
tpcands[name == "sharron angle gop victory committee", name := "angle, sharron"]
tpcands[name == "dold jr., robert james mr.", name := "dold, robert james"]
tpcands[name == "walorski (swihart), jackie", name := "walorski, jackie"]

# no tolower() is applied to this list (the fix-ups above are written in
# lower case, which suggests the file already is)
tpcands <- add_name_to_look(tpcands, lowercase = FALSE)



# read list of ALL republican congressional / senate candidates in 2010 cycle
# excluding fringe candidates with no fundraising or primary vote share < 5%
allcands <- fread(paste0(base_dir, "data/dime_cong_elections_current.csv"))
allcands[is.na(ppct), ppct := 0]

# 2010 House / Senate candidates who won a primary or general election, or
# who had a primary vote share >= 5% and positive receipts; shared by the
# three candidate lists below
viable_2010 <- allcands[cycle == 2010 &
  seat %in% c("federal:house", "federal:senate") &
  ((pwinner == "W" | gwinner == "W") | (ppct >= 0.05 & total_receipts > 0))]

repcands <- viable_2010[party == "R", .(name = Name, state, district)]

repcands[name == "WALORSKI (SWIHART), JACKIE", name := "WALORSKI, JACKIE"]
repcands[name == "YOUNG, C W", name := "YOUNG, CW"]
repcands[name == "DOLD JR., ROBERT JAMES MR.", name := "DOLD, ROBERT JAMES"]

repcands[, name := sub(" , ", ", ", name)]
repcands <- add_name_to_look(repcands)

# drop TP speakers
repcands <- repcands[!tpcands, on = .(name_to_look)]



# same for Dems
demcands <- viable_2010[party == "D", .(name = Name, state, district)]

demcands[name == "FOSTER, G. WILLIAM (BIL", name := "FOSTER, G. WILLIAM (BILL)"]
demcands[name == "BUTTERFIELD, G. K.", name := "BUTTERFIELD, G. K. (GK)"]
demcands[name == "RUPPERSBERGER, C.A. DUTCH", name := "RUPPERSBERGER, DUTCH"]
demcands[name == "MAYHUE, W. PAUL", name := "MAYHUE, PAUL"]

demcands[, name := sub(" , ", ", ", name)]
demcands <- add_name_to_look(demcands)


# placebo: female candidates
femcands <- viable_2010[cand_gender == "F", .(name = Name, state, district)]

femcands[name == "WALORSKI (SWIHART), JACKIE", name := "WALORSKI, JACKIE"]

femcands[, name := sub(" , ", ", ", name)]
femcands <- add_name_to_look(femcands)

### match identified speakers to names in transcripts

# Collect the rows of transcripts_sp_mo whose `speaker` contains a candidate
# name, then drop known false positives.
#
# names_to_look: character vector of "first last" lookup keys; each one is
#                upper-cased and searched for as a fixed substring of `speaker`.
# exclude:       data.table with a name_to_look column listing the keys to drop.
#
# Returns one data.table with the matching transcript rows plus a
# name_to_look column recording which key produced the match.
match_speakers <- function(names_to_look, exclude) {
  matched <- names_to_look %>%
    map(function(cand_name) {
      hits <- transcripts_sp_mo[grepl(toupper(cand_name), speaker, fixed = TRUE)]
      hits[, name_to_look := cand_name]
    }) %>%
    rbindlist()
  matched[!exclude, on = .(name_to_look)]
}

# drop some common names that cause false positives
exclude_tp <- data.table(name_to_look = c(
  "bill johnson",
  "mark reed",
  "john gomez",
  "gary miller",
  "edward martin",
  "paul smith",
  "robert bishop",
  "sandy adams"
))

tpspeakers <- match_speakers(tpcands$name_to_look, exclude_tp)

# drop some common names that cause false positives
exclude_reps <- data.table(name_to_look = c(
  "robert gibbs",  # also Obama's press secretary!
  "david smith",
  "christopher smith",
  "james taylor",
  "john adams",
  "peter schiff",
  "rick allen",
  "scott taylor",
  "james lee",
  "david camp",
  "kevin calvey",
  "ralph hall",
  "david hall",
  "michael murphy",
  "susan smith",
  "david hunt",
  "john tucker",
  "robert wittman",
  "richard green",
  "mark greenberg",
  "john cox",
  "jeff taylor",
  "jack bailey",
  "michael young",
  "john griffin",
  "robert vaughn",
  "patrick murray",
  "eddie adams",
  "enrique torres",
  "steven taylor",
  "craig miller",
  "charles wilson"
))

repspeakers <- match_speakers(repcands$name_to_look, exclude_reps)

exclude_dems <- data.table(name_to_look = c(
  "john adams",
  "david cook",
  "kevin powell",
  "gary johnson",
  "ron klein",
  "susan davis",
  "david robinson",
  "miguel ortiz",
  "robert scott",
  "paul morel",
  "andy wilson",
  "gary peters",
  "michael ross",
  "james bryan",
  "william owens",
  "charles wilson",
  "bill foster",
  "robert burton"
))

demspeakers <- match_speakers(demcands$name_to_look, exclude_dems)

exclude_women <- data.table(name_to_look = c(
    "susan davis",
    "susan smith"
))

femspeakers <- match_speakers(femcands$name_to_look, exclude_women)

# Flag each row of the by-speaker transcript file according to whether its
# speaker label appears among the matched candidate speakers, then write the
# table back to the file it was read from.
transcripts_by_speaker <- readRDS(paste0(output_dir, "transcripts_by_speaker.rds"))
transcripts_by_speaker[, `:=`(
  tea_party = as.numeric(speaker %in% tpspeakers$speaker),
  dem = as.numeric(speaker %in% demspeakers$speaker)
)]
# Tea Party candidates also count as Republicans
transcripts_by_speaker[, rep := as.numeric(tea_party == 1 | speaker %in% repspeakers$speaker)]
transcripts_by_speaker[, female := as.numeric(speaker %in% femspeakers$speaker)]

saveRDS(transcripts_by_speaker, file = paste0(output_dir, "transcripts_by_speaker.rds"))


### FIRST: COUNT WORDS BY CANDIDATES
##  COMPUTE FRACTION OF TOTAL FOR EACH CHANNEL-MONTH

# all words in transcripts_sp_mo per channel-month (the denominator)
totalwords <- transcripts_sp_mo[, .(total_count = sum(word_count, na.rm = TRUE)), by = .(channel, month)]

# Share of each channel-month's words that come from the rows in `speakers`.
#
# speakers:  data.table with channel, month and word_count columns.
# count_col: name to give the per-channel-month word-count column.
#
# Returns a data.table with channel, month, <count_col>, total_count, freq
# and a 1.96-standard-error interval (freq_lb, freq_ub); channel codes are
# recoded to their display labels.
channel_month_share <- function(speakers, count_col) {
  counts <- speakers[, .(n_words = sum(word_count, na.rm = TRUE)), by = .(channel, month)]
  setnames(counts, "n_words", count_col)

  # keep every channel-month in totalwords; those without matched speakers
  # come back as NA and are set to zero
  shares <- counts[totalwords, on = .(channel, month)]
  shares <- shares[!is.na(month)]
  shares[is.na(get(count_col)), (count_col) := 0]

  shares[, freq := get(count_col) / total_count]
  shares[, c("freq_lb", "freq_ub") := {
    half_width <- 1.96 * sqrt(freq * (1 - freq)) / sqrt(total_count)
    .(freq - half_width, freq + half_width)
  }]
  shares[, channel := recode(channel, fox = "FNC", cnn = "CNN", msnbc = "MSNBC")]
  shares
}

tpwordcount <- channel_month_share(tpspeakers, "tp_count")

##  SAME FOR GENERIC R's
repwordcount <- channel_month_share(repspeakers, "rep_count")

##  AND FOR D's
demwordcount <- channel_month_share(demspeakers, "dem_count")


# Draw the monthly word share by channel and save it as a PDF in plot_dir.
#
# data:      data.table with month, freq and channel columns (plus freq_lb
#            and freq_ub when with_ci = TRUE).
# y_label:   y-axis label.
# file_name: PDF file name, relative to plot_dir.
# with_ci:   if TRUE, add a ribbon between freq_lb and freq_ub.
#
# Returns the ggplot object (so a top-level call still displays it).
plot_coverage <- function(data, y_label, file_name, with_ci = FALSE) {
  channel_colours <- c(CNN = "mediumpurple2", FNC = "darkred", MSNBC = "cornflowerblue")

  p <- ggplot(aes(x = month, y = freq, group = channel), data = data) +
    geom_line(aes(colour = channel))
  if (with_ci) {
    p <- p +
      geom_ribbon(aes(ymin = freq_lb, ymax = freq_ub, fill = channel), alpha = 0.2)
  }
  p <- p +
    theme_few() +
    theme(text = element_text(size = 14)) +
    ylab(y_label) +
    xlab("Date") +
    scale_colour_manual(name = "Channel", values = channel_colours)
  if (with_ci) {
    p <- p + scale_fill_manual(name = "Channel", values = channel_colours)
  }
  p <- p +
    scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
    ylim(0, 0.03)

  # pass the plot explicitly rather than relying on last_plot()
  ggsave(filename = paste0(plot_dir, file_name), plot = p, height = 6, width = 8)
  p
}

###
### FIGURE 1 a)
###

plot_coverage(tpwordcount,
              "Fraction of Words Spoken by Tea Party-Affiliated Candidates",
              "tpcands_coverage.pdf")

###
### FIGURE B.1.1 a)
###

plot_coverage(tpwordcount,
              "Fraction of Words Spoken by Tea Party-Affiliated Candidates",
              "tpcands_coverage_with_ci.pdf",
              with_ci = TRUE)


###
### FIGURE 1 b)
###

plot_coverage(repwordcount,
              "Fraction of Words Spoken by Mainstream Rep. Candidates",
              "repcands_coverage.pdf")


###
### FIGURE B.1.1 b)
###

plot_coverage(repwordcount,
              "Fraction of Words Spoken by Mainstream Rep. Candidates",
              "repcands_coverage_with_ci.pdf",
              with_ci = TRUE)


### SECOND: APPLY G-S-T method

# Turn a character vector into a flat character vector of stemmed n-grams.
#
# texts: character vector to tokenize.
# n:     n-gram length passed to tokens_ngrams().
#
# Steps: lower-case, word-tokenize without punctuation / symbols / URLs,
# drop English stopwords and URL-like tokens, stem, form n-grams.
stemmed_ngram <- function(texts, n = 2) {
  url_like <- c("*http*", "*www*", "*\\.com", "*\\.org")

  toks <- tokens(char_tolower(texts),
                 what = "word",
                 remove_punct = TRUE,
                 remove_symbols = TRUE,
                 remove_url = TRUE)
  toks <- tokens_remove(toks, pattern = stopwords::stopwords("en"))
  toks <- tokens_remove(toks, pattern = url_like)
  toks <- tokens_wordstem(toks)

  as.character(tokens_ngrams(toks, n = n))
}

# Build a quanteda tokens object of stemmed bigrams, one element per input
# text. Each text is split into sentences first and stemmed_ngram() is applied
# to that text's sentence vector.
tokenize_fully <- function(text) {
    sentences <- tokenize_sentences(text)
    bigrams <- map(sentences, stemmed_ngram, n = 2)
    tokens(bigrams)
}

# Flag rows of transcripts_sp_mo spoken by matched candidates. `female` is
# created here as well because the docvars assigned to trdfm further down
# select it from transcripts_sp_mo; it is defined the same way as on
# transcripts_by_speaker and polspeech.
transcripts_sp_mo[, tea_party := as.numeric(speaker %in% unique(tpspeakers$speaker))]
transcripts_sp_mo[, dem := as.numeric(speaker %in% unique(demspeakers$speaker))]
transcripts_sp_mo[, rep := as.numeric((speaker %in% unique(repspeakers$speaker)) | tea_party == 1)]
transcripts_sp_mo[, female := as.numeric(speaker %in% unique(femspeakers$speaker))]


## Generate document-feature-matrix to fit model in the speech of politicians
# Stack the three matched-speaker tables. The `:=` calls add the tea_party /
# rep labels to tpspeakers, repspeakers and demspeakers by reference before
# rbind() combines them.
polspeech <- rbind(tpspeakers[, c("tea_party", "rep") := list(1, 1)],
                  repspeakers[, c("tea_party", "rep") := list(0, 1)],
                  demspeakers[, c("tea_party", "rep") := list(0, 0)])

# placebo label: 1 when the speaker label also appears in femspeakers
polspeech[, female := as.numeric(speaker %in% unique(femspeakers$speaker))]

# stemmed-bigram counts, one document per polspeech row; keep features that
# appear in at least 0.05% and at most 40% of documents
polsdfm <- polspeech[, text] %>%
    tokenize_fully %>%
    dfm() %>%
    dfm_trim(min_docfreq = 0.0005,
             max_docfreq = 0.4,
             docfreq_type = "prop")

saveRDS(polsdfm, file = paste0(output_dir, "polsdfm.rds"))

## covariates
# party indicators plus one dummy per month (no intercept)
covars <- sparse.model.matrix(~ rep + tea_party + factor(month) - 1, data = polspeech)

## run DMR
# Create the worker cluster explicitly so it can be shut down once the fit
# finishes (or fails); otherwise the six forked workers stay alive.
dmr_cluster <- makeForkCluster(nnodes = 6, outfile = paste0(output_dir, "dmr_log.txt"))
phrase_models <- tryCatch(
  dmr(cl = dmr_cluster,
      covars = covars,
      counts = polsdfm,
      fixedcost = 1e-5,
      # column indices passed through as `free`; only the month dummies match
      # here, since this formula has no channel term
      free = grep("month|channel", colnames(covars))
      ),
  finally = parallel::stopCluster(dmr_cluster)
)

## extract phrase coefs
tvcoefs <- coef(phrase_models)

# save, then reload so the rest of the script can be run from the saved file
saveRDS(tvcoefs, file = paste0(output_dir, "tea_party_transcript_coefs.rds"))
tvcoefs <- readRDS(paste0(output_dir, "tea_party_transcript_coefs.rds"))
head(colnames(tvcoefs))

# phrases with the largest / smallest tea_party coefficients
tvcoefs["tea_party", ] %>% sort %>% tail(20)
tvcoefs["tea_party", ] %>% sort %>% head(20)

# per-phrase coefficient vectors used to score documents below
tpscores <- tvcoefs["tea_party", ] %>% as("sparseVector")
repscores <- tvcoefs["rep", ] %>% as("sparseVector")


## covariates (placebo model)
# same design as the main model with `female` in place of `tea_party`
covars_placebo <- sparse.model.matrix(~ rep + female + factor(month) - 1, data = polspeech)

## run DMR (placebo model)
# Create the worker cluster explicitly so it can be shut down once the fit
# finishes (or fails); otherwise the six forked workers stay alive.
dmr_cluster_placebo <- makeForkCluster(nnodes = 6, outfile = paste0(output_dir, "dmr_log_placebo.txt"))
phrase_models_placebo <- tryCatch(
  dmr(cl = dmr_cluster_placebo,
      covars = covars_placebo,
      counts = polsdfm,
      fixedcost = 1e-5,
      # take the indices from the matrix actually passed as `covars`
      free = grep("month|channel", colnames(covars_placebo))
      ),
  finally = parallel::stopCluster(dmr_cluster_placebo)
)

## extract phrase coefs (placebo)
tvcoefs_placebo <- coef(phrase_models_placebo)

# save, then reload so the rest of the script can be run from the saved file
saveRDS(tvcoefs_placebo, file = paste0(output_dir, "placebo_transcript_coefs.rds"))
tvcoefs_placebo <- readRDS(paste0(output_dir, "placebo_transcript_coefs.rds"))
head(colnames(tvcoefs_placebo))

# phrases with the largest / smallest female coefficients
tvcoefs_placebo["female", ] %>% sort %>% tail(20)
tvcoefs_placebo["female", ] %>% sort %>% head(20)

# per-phrase coefficient vectors used to score documents below
femscores <- tvcoefs_placebo["female", ] %>% as("sparseVector")
repscores_p <- tvcoefs_placebo["rep", ] %>% as("sparseVector")


## fit forward regression in-sample
# Score each polspeech document: divide its phrase counts by the row total
# and take the inner product with the tea_party / rep phrase coefficients.
polspeech[, tphat := as.numeric((polsdfm / rowSums(polsdfm, sparseResult = TRUE)) %*% tpscores)]
polspeech[, rhat := as.numeric((polsdfm / rowSums(polsdfm, sparseResult = TRUE)) %*% repscores)]

# logistic regressions of the speaker labels on the two scores
logit_model <- glm(tea_party ~ tphat + rhat, data = polspeech, family = binomial)
logit_model_r <- glm(rep ~ tphat + rhat, data = polspeech, family = binomial)

# recorded output from a previous run
summary(logit_model)
#             Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -2.79740    0.07734  -36.17   <2e-16 ***
# tphat        4.58814    0.15158   30.27   <2e-16 ***
# rhat         2.08184    0.10694   19.47   <2e-16 ***

summary(logit_model_r)
#             Estimate Std. Error z value Pr(>|z|)    
# (Intercept)  0.40585    0.04903   8.278   <2e-16 ***
# tphat        1.91052    0.09687  19.723   <2e-16 ***
# rhat         6.74900    0.18146  37.194   <2e-16 ***

# NOTE: written with save(), so despite the .rds extension this file has to
# be read back with load(), not readRDS()
save(logit_model, logit_model_r, file = paste0(output_dir, "forward_regressions.rds"))

## fit forward regression in-sample (placebo model)
# Same scoring as the main model, using the placebo (female / rep) phrase
# coefficients.
polspeech[, femhat := as.numeric((polsdfm / rowSums(polsdfm, sparseResult = TRUE)) %*% femscores)]
polspeech[, rhat_p := as.numeric((polsdfm / rowSums(polsdfm, sparseResult = TRUE)) %*% repscores_p)]

# logistic regressions of the speaker labels on the two placebo scores
logit_model_fem <- glm(female ~ femhat + rhat_p, data = polspeech, family = binomial)
logit_model_r_p <- glm(rep ~ femhat + rhat_p, data = polspeech, family = binomial)

# recorded output from a previous run
summary(logit_model_fem)
#             Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -1.51510    0.05416 -27.976   <2e-16 ***
# femhat       5.90937    0.17991  32.846   <2e-16 ***
# rhat_p       0.02792    0.08643   0.323    0.747    

summary(logit_model_r_p)
#             Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -0.22577    0.05059  -4.463 8.10e-06 ***
# femhat      -0.42934    0.10643  -4.034 5.48e-05 ***
# rhat_p       6.64219    0.18120  36.656  < 2e-16 ***


# NOTE: written with save(), so despite the .rds extension this file has to
# be read back with load(), not readRDS()
save(logit_model_fem, logit_model_r_p, file = paste0(output_dir, "forward_regressions_placebo.rds"))

## predict, collapsing first
# Bigram counts for every transcripts_sp_mo row, restricted to (and ordered
# like) the features of polsdfm so the phrase coefficient vectors line up.
trdfm <- transcripts_sp_mo[, text] %>%
    tokenize_fully %>%
    dfm() %>%
    dfm_match(features = featnames(polsdfm))

# save, then reload so the rest of the script can be run from the saved file
saveRDS(trdfm, file = paste0(output_dir, "trdfm.rds"))
trdfm <- readRDS(paste0(output_dir, "trdfm.rds"))

# grouping key "<channel>-<month>"
transcripts_sp_mo[, channelmo := paste(channel, month, sep = "-")]


## predict at channel-month level
# requires the rep, dem, tea_party and female columns on transcripts_sp_mo
docvars(trdfm) <- transcripts_sp_mo[, .(channel, month, channelmo, rep, dem, tea_party, female)]

trdfm_channelmo <- trdfm %>%
  dfm_subset(rep == 0 & dem == 0 & tea_party == 0) %>% # exclude text spoken by pols on-air
  dfm_group(channelmo)

tp_pred_mo <- as.data.table(docvars(trdfm_channelmo))

# score each channel-month the same way as the polspeech documents: phrase
# counts divided by the row total, times the phrase coefficients
tp_pred_mo[, tphat := as.numeric((trdfm_channelmo / rowSums(trdfm_channelmo, sparseResult = TRUE)) %*% tpscores)]
tp_pred_mo[, rhat := as.numeric((trdfm_channelmo / rowSums(trdfm_channelmo, sparseResult = TRUE)) %*% repscores)]
tp_pred_mo[, femhat := as.numeric((trdfm_channelmo / rowSums(trdfm_channelmo, sparseResult = TRUE)) %*% femscores)]
tp_pred_mo[, rhat_p := as.numeric((trdfm_channelmo / rowSums(trdfm_channelmo, sparseResult = TRUE)) %*% repscores_p)]
# predicted probabilities from the forward regressions
tp_pred_mo[, tp_pred := predict(logit_model, type = "response", newdata = tp_pred_mo)]
tp_pred_mo[, r_pred := predict(logit_model_r, type = "response", newdata = tp_pred_mo)]
tp_pred_mo[, fem_pred := predict(logit_model_fem, type = "response", newdata = tp_pred_mo)]

# recover channel (text before the first "-") and month (the rest) from the
# grouping key; channel codes are recoded to display labels
tp_pred_mo[, channel := recode(sub("-.*$", "", channelmo), fox = "FNC", cnn = "CNN", msnbc = "MSNBC")]
tp_pred_mo[, month := ymd(sub("\\w+-(.*)$", "\\1", channelmo))]

# the predicted value with no speech data
# (both scores set to zero)
uninformative <- predict(logit_model, newdata = data.table(tphat = 0, rhat = 0), type = "response")
uninformative_r <- predict(logit_model_r, newdata = data.table(tphat = 0, rhat = 0), type = "response")

# Plot one monthly language score from tp_pred_mo by channel and save it as a
# PDF in plot_dir.
#
# score_col: name of the tp_pred_mo column to put on the y axis.
# y_label:   y-axis label.
# y_max:     upper y-axis limit (the lower limit is 0).
# file_name: PDF file name, relative to plot_dir.
#
# Returns the ggplot object (so a top-level call still displays it).
plot_language_score <- function(score_col, y_label, y_max, file_name) {
  p <- ggplot(aes(x = month, y = .data[[score_col]], group = channel), data = tp_pred_mo) +
    geom_line(aes(colour = channel)) +
    ylab(y_label) +
    xlab("Date") +
    scale_colour_manual(
      name = "Channel",
      values = c(CNN = "mediumpurple2", FNC = "darkred", MSNBC = "cornflowerblue")) +
    scale_x_date(date_breaks = "1 month",
                 date_labels = "%b %y",
                 limits = c(ymd("2009-02-15", "2010-11-15"))) +
    theme(text = element_text(size = 20)) +
    ylim(0, y_max) +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

  # pass the plot explicitly rather than relying on last_plot()
  ggsave(filename = paste0(plot_dir, file_name), plot = p, height = 6, width = 8)
  p
}

###
### FIGURE 3 a)
###

plot_language_score("tp_pred", "Tea Party Language Score", 0.05,
                    "appearance_GST_score.pdf")

###
### FIGURE 3 b)
###

plot_language_score("r_pred", "Republican Language Score", 0.75,
                    "appearance_GST_score_R.pdf")


# placebo: female politician language score
plot_language_score("fem_pred", "Female Politician Language Score", 0.07,
                    "appearance_GST_score_female.pdf")


## compute aggregate means and CIs for these via subsampling

# first: overall prediction at channel level
# (same as the channel-month prediction, but grouped by channel only)
trdfm_channel <- trdfm %>%
  dfm_subset(rep == 0 & dem == 0 & tea_party == 0) %>% # exclude text spoken by pols on-air
  dfm_group(channel)

tp_pred_pool <- as.data.table(docvars(trdfm_channel))

# phrase counts divided by the row total, times the phrase coefficients
tp_pred_pool[, tphat := as.numeric((trdfm_channel / rowSums(trdfm_channel, sparseResult = TRUE)) %*% tpscores)]
tp_pred_pool[, rhat := as.numeric((trdfm_channel / rowSums(trdfm_channel, sparseResult = TRUE)) %*% repscores)]
# predicted probabilities from the full-sample forward regressions
tp_pred_pool[, tp_pred := predict(logit_model, type = "response", newdata = tp_pred_pool)]
tp_pred_pool[, r_pred := predict(logit_model_r, type = "response", newdata = tp_pred_pool)]

# channel codes -> display labels
tp_pred_pool[, channel := recode(channel, fox = "FNC", cnn = "CNN", msnbc = "MSNBC")]

# second: subsample, re-fit, predict on 10% samples per G-S-T paper
# (the driver below runs 200 iterations)
# stratify sampling to ensure mainstream R's and tea-partiers present
#
# One subsampling iteration: draw a stratified subsample of polspeech, re-fit
# the DMR and the forward regressions on it, and predict the channel-level
# scores for trdfm_channel.
#
# iter:    iteration number; printed and stored in the `id` column.
# covars:  full-sample covariate matrix. Kept for interface compatibility;
#          the design matrix is rebuilt from the subsample.
# counts:  document-feature matrix whose rows correspond to polspeech rows.
# ss_frac: fraction of each (rep, tea_party) stratum to sample.
#
# Reads the globals polspeech, trdfm_channel and output_dir.
# Returns a data.table with one row per channel: the docvars of
# trdfm_channel plus tphat, rhat, tp_pred, r_pred and id.
subsample <- function(iter, covars, counts, ss_frac = 0.1) {
  cat("Iteration", iter, "...\n")

  # stratified sample blocking by mainstream rep, tea party rep, dem
  # (.I[sample.int(.N, ...)] rather than sample(.I, ...): sample() on a
  # length-one vector k would draw from 1:k)
  this_sample <- polspeech[,
    .(i = .I[sample.int(.N, size = ceiling(.N * ss_frac), replace = FALSE)]),
    by = .(rep, tea_party)][, i]

  count_sub <- counts[this_sample, ]
  polspeech_sub <- polspeech[this_sample, ]
  covar_sub <- sparse.model.matrix(~ rep + tea_party + factor(month) - 1, data = polspeech_sub)

  # shut the workers down when this iteration ends, including on error
  cl <- makeForkCluster(nnodes = 6, outfile = paste0(output_dir, "dmr_log.txt"))
  on.exit(parallel::stopCluster(cl), add = TRUE)

  phrase_models <- dmr(cl = cl,
                    covars = covar_sub,
                    counts = count_sub,
                    fixedcost = 1e-5,
                    # indices must refer to the matrix actually passed in; the
                    # subsample can contain fewer months than the full sample
                    free = grep("month|channel", colnames(covar_sub))
                    )

  tvcoefs <- coef(phrase_models)
  tpscores <- tvcoefs["tea_party", ] %>% as("sparseVector")
  repscores <- tvcoefs["rep", ] %>% as("sparseVector")

  # in-sample scores and forward regressions on the subsample
  polspeech_sub[, tphat := as.numeric((count_sub / rowSums(count_sub, sparseResult = TRUE)) %*% tpscores)]
  polspeech_sub[, rhat := as.numeric((count_sub / rowSums(count_sub, sparseResult = TRUE)) %*% repscores)]

  logit_model <- glm(tea_party ~ tphat + rhat, data = polspeech_sub, family = binomial)
  logit_model_r <- glm(rep ~ tphat + rhat, data = polspeech_sub, family = binomial)

  # channel-level predictions using this iteration's coefficients
  tp_pred_pool_sub <- as.data.table(docvars(trdfm_channel))

  tp_pred_pool_sub[, tphat := as.numeric((trdfm_channel / rowSums(trdfm_channel, sparseResult = TRUE)) %*% tpscores)]
  tp_pred_pool_sub[, rhat := as.numeric((trdfm_channel / rowSums(trdfm_channel, sparseResult = TRUE)) %*% repscores)]
  tp_pred_pool_sub[, tp_pred := predict(logit_model, type = "response", newdata = tp_pred_pool_sub)]
  tp_pred_pool_sub[, r_pred := predict(logit_model_r, type = "response", newdata = tp_pred_pool_sub)]

  tp_pred_pool_sub[, channel := recode(channel, fox = "FNC", cnn = "CNN", msnbc = "MSNBC")]

  tp_pred_pool_sub[, id := iter]

  tp_pred_pool_sub

}

# Run 200 subsampling iterations with a fixed seed and stack the results.
# An iteration that errors contributes an empty data.table instead of
# stopping the run.
set.seed(138549)
safe_subsample <- possibly(subsample, otherwise = data.table())
subsample_ests <- rbindlist(
  map(1:200, safe_subsample, covars = covars, counts = polsdfm)
)

# get critical values
# n: number of polspeech rows; b: nominal subsample size (10% of n).
# NOTE(review): subsample() draws ceiling(.N * ss_frac) rows per
# (rep, tea_party) stratum, so the realised subsample size can differ
# slightly from b.
n <- nrow(polspeech)
b <- ceiling(n * 0.1)

# critical values from the subsampling distribution
# (5th / 95th percentiles of the subsample predictions per channel, scaled
# by sqrt(b))
crits <- subsample_ests[,
    .(tp_c_05 = sqrt(b) * quantile(tp_pred, 0.05),
      tp_c_95 = sqrt(b) * quantile(tp_pred, 0.95),
      r_c_05 = sqrt(b) * quantile(r_pred, 0.05),
      r_c_95 = sqrt(b) * quantile(r_pred, 0.95)),
    by = .(channel)]


# Interval = full-sample prediction +/- (c_95 + c_05) / 2 / sqrt(n).
# NOTE(review): the half-width here is the average of the two scaled
# quantiles of the raw (uncentred) subsample predictions. Subsampling
# intervals are usually built from quantiles of
# sqrt(b) * (subsample estimate - full-sample estimate) -- confirm this
# formula is what is intended.
conf_ints <- tp_pred_pool[crits, on = .(channel)] %>%
  .[, tp_pred_lb := tp_pred - (tp_c_95+tp_c_05) / 2 / sqrt(n)] %>%
  .[, tp_pred_ub := tp_pred + (tp_c_95+tp_c_05) / 2 / sqrt(n)] %>%
  .[, r_pred_lb := r_pred - (r_c_95+r_c_05) / 2/ sqrt(n)] %>%
  .[, r_pred_ub := r_pred + (r_c_95+r_c_05) / 2 / sqrt(n)]


###
### FIGURE B.3.1 a)
###

# Point estimate and interval of the Tea Party language score, one per channel.
tp_conf_int_plot <- ggplot(aes(x = channel, colour = channel), data = conf_ints) +
  geom_point(aes(y = tp_pred), size = 5) +
  geom_linerange(aes(ymin = tp_pred_lb, ymax = tp_pred_ub), size = 2) +
  ylab("Tea Party Language Score") +
  xlab("Channel") +
  ylim(c(0.01,0.04)) +
  scale_colour_manual(
    name = "Channel",
    values = c(CNN = "mediumpurple2", FNC = "darkred", MSNBC = "cornflowerblue")) +
  theme(text = element_text(size = 20), legend.position="none")

ggsave(tp_conf_int_plot, filename = paste0(plot_dir, "appearance_GST_score_tp_confints.pdf"), height = 6, width = 8)

###
### FIGURE B.3.1 b)
###
# Same plot for the Republican language score; no fixed y-axis limits are
# set here.
r_conf_int_plot <- ggplot(aes(x = channel, colour = channel), data = conf_ints) +
  geom_point(aes(y = r_pred), size = 5) +
  geom_linerange(aes(ymin = r_pred_lb, ymax = r_pred_ub), size = 2) +
  ylab("Republican Language Score") +
  xlab("Channel") +
  scale_colour_manual(
    name = "Channel",
    values = c(CNN = "mediumpurple2", FNC = "darkred", MSNBC = "cornflowerblue")) +
  theme(text = element_text(size = 20), legend.position="none")

ggsave(r_conf_int_plot, filename = paste0(plot_dir, "appearance_GST_score_r_confints.pdf"), height = 6, width = 8)



### THIRD: What are TP candidates' distinctive issues?

# Convert raw text into a trimmed document-feature matrix for topic modelling.
#
# text: character vector, one element per document.
#
# Returns a quanteda dfm with one row per element of `text`.
preprocess <- function(text) {
  # preprocessing pipeline
  # 1. strip punctuation, numbers, other junk 2. remove stopwords 3. stem
  # 4. keep top 20k words
  text %>%
   tolower %>%
   tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE) %>%
   tokens_select(pattern = stopwords("en"), selection = "remove", min_nchar = 3) %>%
   tokens_wordstem %>%
   dfm %>%
   dfm_trim(min_docfreq = 20000, docfreq_type = "rank")
}

# reduce to show by date by speaker
# (one document per channel / show / date / transcript / speaker, with that
# speaker's text pasted together)
tr <- transcripts_by_speaker[,
  .(text = paste(text, collapse = ". ")),
  by = .(channel, show, date, transcript_number, speaker)
]

# candidate indicators, defined as for the other transcript tables
tr[, tea_party := as.numeric(speaker %in% unique(tpspeakers$speaker))]
tr[, dem := as.numeric(speaker %in% unique(demspeakers$speaker))]
tr[, rep := as.numeric((speaker %in% unique(repspeakers$speaker)) | tea_party == 1)]

# month of the transcript date, stored as a character string
tr[, month := as.character(floor_date(date, unit = "month"))]

tr_dfm <- tr[, text] %>% preprocess

# documents that still contain at least one feature after preprocessing
nonblank_docs <- which(rowSums(tr_dfm) != 0)

# NOTE(review): no seed is set before this call -- confirm whether a re-fit
# reproduces the topic numbering that the hard-coded topic ids further down
# rely on.
model <- stm(documents = tr_dfm[nonblank_docs, ],
    K = 0,  # use Lee and Mimno algorithm
    prevalence = ~ channel + dem + rep + tea_party + month,
    data = tr[nonblank_docs, ],
    init.type = "Spectral",
    max.em.its = 100)

# save the fitted model and its metadata, then reload both so the remainder
# of the script can be run from the saved files
model %>% saveRDS(paste0(output_dir, "cable_stm_01062023.rds"))
save(tr, nonblank_docs, file = paste0(output_dir, "cable_stm_01062023_meta.RData"))
model <- readRDS(paste0(output_dir, "cable_stm_01062023.rds"))
load(paste0(output_dir, "cable_stm_01062023_meta.RData"))

# extract coefs
# (regress topic proportions on the same covariates used for prevalence)
meta <- estimateEffect(formula = ~ channel + dem + rep + tea_party + month,
  stmobj = model,
  metadata = as.data.frame(tr[nonblank_docs, ]))

# find topics that Tea Party cands talk about most
# (one row per topic: the tea_party estimate and t value; `topic` is the id
# column generated by rbindlist)
tp_coefs <- summary(meta)$tables %>%
  map(~ as.data.table(t(.["tea_party", c("Estimate", "t value")]))) %>%
  rbindlist(idcol = "topic")

tp_coefs <- tp_coefs[order(`t value`, decreasing = T)]
tp_coefs[`t value` > 2][order(Estimate, decreasing = T)]
# recorded output from a previous run (topic, Estimate, t value):
#  1:    24 0.023759243 12.483777
#  2:     4 0.012722378  4.500850
#  3:    64 0.010250053  3.975273
#  4:    12 0.009914329  5.794914
#  5:    17 0.009655550  4.808520
#  6:    25 0.008849363  3.654488
#  7:    55 0.005888164  2.285813
#  8:    34 0.005404877  3.762268
#  9:    45 0.005217911  3.065535
# 10:    27 0.005083099  2.913111
# 11:    46 0.004431985  2.731740
# 12:    61 0.002538112  2.333214
# 13:    30 0.002465228  2.551144
# 14:    11 0.001658409  3.947005
# 15:    28 0.001010114  5.498130

# extract informative words for labels
# (topic numbers are hard-coded from the recorded output above)
labelTopics(model, topics = 24)  # sharron angle senate race
labelTopics(model, topics = 4)   # tea party election prospects
labelTopics(model, topics = 64)  # immigration enforcement
labelTopics(model, topics = 12)  # racism / political correctness
labelTopics(model, topics = 17)  # supreme court
labelTopics(model, topics = 25)  # religion (Islam)
labelTopics(model, topics = 55)  # banking (bailouts)
labelTopics(model, topics = 34)  # katrina / new orleans
labelTopics(model, topics = 45)  # generic filler
labelTopics(model, topics = 27)  # haitian earthquake
labelTopics(model, topics = 46)  # cartels / drug war / hamas / gaza
labelTopics(model, topics = 61)  # scams?
labelTopics(model, topics = 30)  # FNC specific
labelTopics(model, topics = 11)  # generic filler
labelTopics(model, topics = 28)  # generic filler


# make table
# first seven FREX words for the six hard-coded topics, in table order
out <- labelTopics(model)
words <- as.data.table(out$frex[c(24, 4, 64, 12, 17, 25), 1:7]) 
colnames(words) <- paste0("word", 1:7)

words[, top_words := paste(word1, word2, word3, word4, word5, word6, word7, sep = ", ")]


# six largest tea_party estimates among topics with t value > 2.
# NOTE(review): `words` and the labels below are attached by position, so
# they line up only while these six rows are topics 24, 4, 64, 12, 17, 25 in
# this order -- verify after any re-fit of the model.
keep_coefs <- tp_coefs[`t value` > 2] %>% 
  .[order(Estimate, decreasing = T)] %>%
  .[1:6]

keep_coefs[, top_words := words$top_words]
keep_coefs[, Label := c("NV Senate Campaign",
                        "Tea Party Election Prospects",
                        "Immigration Enforcement",
                        "Racism / Political Correctness",
                        "Supreme Court",
                        "Religion (Islam)")]

###
### TABLE 1
###

keep_coefs[, .(Topic = topic, Label, Estimate, `t-stat` = `t value`, `Top Words` = top_words)] %>%
  gt %>%
  fmt_number(columns = c(Estimate, `t-stat`), decimals = 3) %>%
  fmt_integer(columns = Topic) %>%
  gtsave(paste0(table_dir, "stm_tp_topics.tex"))




# find topics that R cands talk about most
rep_coefs <- summary(meta)$tables %>%
  map(~ as.data.table(t(.["rep", c("Estimate", "t value")]))) %>%
  rbindlist(idcol = "topic")

rep_coefs <- rep_coefs[order(`t value`, decreasing = T)]
rep_coefs[`t value` > 2][order(Estimate, decreasing = T)]

#     topic    Estimate   t value
#  1:    15 0.0589831393 31.423258
#  2:    65 0.0434825273 35.660537
#  3:    59 0.0239400004 28.648527
#  4:    40 0.0198171961 26.100721
#  5:     7 0.0194997881 17.538498
#  6:    29 0.0156319567 15.165797
#  7:    23 0.0097541618  9.822193
#  8:    64 0.0092150432 11.504775
#  9:    12 0.0056057353  7.003901
# 10:    55 0.0054835179  4.280993
# 11:    18 0.0052016625  4.292135
# 12:    66 0.0043251888  3.671434
# 13:     4 0.0043201133  3.312153
# 14:    56 0.0040435718  3.523757
# 15:    24 0.0038496925  9.508381
# 16:    19 0.0031551716  4.782625
# 17:    54 0.0027369612  7.649820
# 18:    37 0.0017720165  3.561740
# 19:     8 0.0012214195  2.758562
# 20:    67 0.0002704252  2.920670

# Inspect the first twelve topics of the table above (same order, largest
# `rep` estimate first). Each call is left at top level so its result is
# auto-printed; the trailing comment is the hand-assigned reading of the topic.
labelTopics(model, topics = 15)  # tax / budget / stimulus
labelTopics(model, topics = 65)  # unemployment
labelTopics(model, topics = 59)  # legislative process
labelTopics(model, topics = 40)  # house politics
labelTopics(model, topics = 7)   # healthcare (ACA)
labelTopics(model, topics = 29)  # generic filler
labelTopics(model, topics = 23)  # senate politics
labelTopics(model, topics = 64)  # immigration enforcement
labelTopics(model, topics = 12)  # racism / political correctness
labelTopics(model, topics = 55)  # banking (bailouts)
labelTopics(model, topics = 18)  # homeland security
labelTopics(model, topics = 66)  # gitmo / torture


# make table
# Topics shown in Table 2 and their hand-assigned labels, kept side by side so
# the id -> label pairing is explicit. (Topic 29, "generic filler", is
# deliberately left out.)
rep_topics <- c(15, 65, 59, 40, 7, 23)
rep_labels <- c(
  "Taxes / Budget / Stimulus",  # 15
  "(Un)employment",             # 65
  "Legislative Process",        # 59
  "House Politics",             # 40
  "Healthcare",                 # 7
  "Senate Politics"             # 23
)
stopifnot(length(rep_topics) == length(rep_labels))

# Seven highest-FREX words for each selected topic, one row per topic in the
# order of `rep_topics`, collapsed into a single comma-separated string.
out <- labelTopics(model)
words <- as.data.table(out$frex[rep_topics, 1:7])
colnames(words) <- paste0("word", 1:7)

words[, top_words := paste(word1, word2, word3, word4, word5, word6, word7, sep = ", ")]


# Significant (t > 2) coefficients for the selected topics, largest first.
keep_coefs <- rep_coefs[`t value` > 2] %>%
  .[order(Estimate, decreasing = TRUE)] %>%
  .[topic %in% rep_topics]

# Look words and labels up by topic id instead of relying on the rows of
# `keep_coefs` happening to come out in the same order as `rep_topics`:
# a positional assignment would silently attach the wrong words/label if the
# Estimate ranking changed, and would fail if a topic dropped out of the
# t > 2 filter.
topic_pos <- match(keep_coefs$topic, rep_topics)
keep_coefs[, top_words := words$top_words[topic_pos]]
keep_coefs[, Label := rep_labels[topic_pos]]

###
### TABLE 2
###

# Pick the Table 2 columns (this makes a new table, so `keep_coefs` itself is
# not renamed), give them display names, format, and write the LaTeX file.
keep_coefs[, .(topic, Label, Estimate, `t value`, top_words)] %>%
  setnames(
    old = c("topic", "t value", "top_words"),
    new = c("Topic", "t-stat", "Top Words")
  ) %>%
  gt() %>%
  fmt_number(columns = c("Estimate", "t-stat"), decimals = 3) %>%
  fmt_integer(columns = "Topic") %>%
  gtsave(filename = paste0(table_dir, "stm_rep_topics.tex"))


## FNC topics
# One row per element of summary(meta)$tables: the "channelfox" row's Estimate
# and t value, with the element's position/name stored in `topic`.
fnc_coefs <- rbindlist(
  lapply(summary(meta)$tables, function(tab) {
    as.data.table(t(tab["channelfox", c("Estimate", "t value")]))
  }),
  idcol = "topic"
)

# Significant (t > 2) topics, largest estimate first.
fnc_coefs[`t value` > 2][order(Estimate, decreasing = TRUE)]

#  1:    30 0.0228197434 85.251544
#  2:    15 0.0195655702 48.490078
#  3:    42 0.0180567350 49.749200
#  4:    20 0.0164058813 56.323665
#  5:     4 0.0159594297 39.581447
#  6:    29 0.0116716833 36.007493
#  7:    66 0.0107209042 33.146144
#  8:    59 0.0091443295 51.878559
#  9:     2 0.0090846297 34.818998
# 10:    23 0.0084587022 26.208565

# Inspect the ten topics listed in the commented output above, in the same
# order (largest `channelfox` estimate first). Each call is left at top level
# so its result is auto-printed; the trailing comment is the hand-assigned
# reading of the topic.
labelTopics(model, topics = 30)  # FNC-specific
labelTopics(model, topics = 15)  # tax / budget / stimulus
labelTopics(model, topics = 42)  # O'Reilly language
labelTopics(model, topics = 20)  # Hannity language
labelTopics(model, topics = 4)   # Tea party election prospects
labelTopics(model, topics = 29)  # generic filler
labelTopics(model, topics = 66)  # terrorism / gitmo / torture
labelTopics(model, topics = 59)  # legislative process
labelTopics(model, topics = 2)   # israel-palestine conflict
labelTopics(model, topics = 23)  # senate politics



# Cosine similarity between two numeric vectors:
#   sum(x * y) / (||x|| * ||y||)
#
# x, y: numeric vectors of the same length whose elements correspond
#       position by position.
# Returns a single number in [-1, 1]; NaN if either vector has zero norm and
# NA if either contains NA.
# Stops if the lengths differ: R would otherwise recycle the shorter vector
# and return a number that is not a cosine similarity of anything.
cossim <- function(x, y) {
  if (length(x) != length(y)) {
    stop(
      "cossim(): `x` and `y` must have the same length (got ",
      length(x), " and ", length(y), ")",
      call. = FALSE
    )
  }
  sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
}

# Cosine similarity between candidate-group and channel coefficient profiles
# across topics.
#
# cossim() compares its arguments position by position, so both vectors must
# list the topics in the same order. rep_coefs has been re-sorted by t value
# above while fnc_coefs / msnbc_coefs are still in topic order, so every table
# is explicitly ordered by `topic` here before its estimates are extracted.
# (Ordering tp_coefs as well is harmless if it is already in topic order.)
cossim(tp_coefs[order(topic), Estimate], fnc_coefs[order(topic), Estimate])
cossim(rep_coefs[order(topic), Estimate], fnc_coefs[order(topic), Estimate])

# Same coefficient extraction as for FNC, for the "channelmsnbc" row.
msnbc_coefs <- summary(meta)$tables %>%
  map(~ as.data.table(t(.["channelmsnbc", c("Estimate", "t value")]))) %>%
  rbindlist(idcol = "topic")
cossim(tp_coefs[order(topic), Estimate], msnbc_coefs[order(topic), Estimate])
cossim(rep_coefs[order(topic), Estimate], msnbc_coefs[order(topic), Estimate])

## first plot coverage of TP-emphasized topics
# Inspect candidate topics; the trailing comment on each call is the
# hand-assigned reading of that topic. Only the first six (24, 4, 64, 12, 17,
# 25) are copied into `tr` as topic_* columns below.
labelTopics(model, topics = 24)  # sharron angle senate race
labelTopics(model, topics = 4)   # tea party election prospects
labelTopics(model, topics = 64)  # immigration enforcement
labelTopics(model, topics = 12)  # racism / political correctness
labelTopics(model, topics = 17)  # supreme court
labelTopics(model, topics = 25)  # religion (Islam)
labelTopics(model, topics = 55)  # banking (bailouts)
labelTopics(model, topics = 34)  # katrina / new orleans
labelTopics(model, topics = 45)  # generic filler
labelTopics(model, topics = 27)  # haitian earthquake
labelTopics(model, topics = 46)  # cartels / drug war / hamas / gaza
labelTopics(model, topics = 61)  # scams?
labelTopics(model, topics = 30)  # FNC specific
labelTopics(model, topics = 11)  # generic filler
labelTopics(model, topics = 28)  # generic filler

# Copy the theta column of each Tea-Party-emphasized topic into `tr`, for the
# rows selected by `nonblank_docs` (other rows get NA in newly created
# columns). Columns are created in the order listed.
tr[nonblank_docs, `:=`(
  topic_tea_party     = model$theta[, 4],
  topic_sharron_angle = model$theta[, 24],
  topic_immigration   = model$theta[, 64],
  topic_racism        = model$theta[, 12],
  topic_islam         = model$theta[, 25],
  topic_supcourt      = model$theta[, 17]
)]


# Re-inspect the Republican-emphasized topics. All of these except topic 29
# ("generic filler") are copied into `tr` as topic_* columns below.
labelTopics(model, topics = 15)  # tax / budget / stimulus
labelTopics(model, topics = 65)  # unemployment
labelTopics(model, topics = 59)  # legislative process
labelTopics(model, topics = 40)  # house politics
labelTopics(model, topics = 7)   # healthcare (ACA)
labelTopics(model, topics = 29)  # generic filler
labelTopics(model, topics = 23)  # senate politics
# Copy the theta column of each Republican-emphasized topic into `tr`, for the
# rows selected by `nonblank_docs`. Columns are created in the order listed.
# (topic 65 is stored as `topic_hard_work`; it is displayed as "Unemployment".)
tr[nonblank_docs, `:=`(
  topic_taxes               = model$theta[, 15],
  topic_hard_work           = model$theta[, 65],
  topic_legislative_process = model$theta[, 59],
  topic_house               = model$theta[, 40],
  topic_healthcare          = model$theta[, 7],
  topic_senate              = model$theta[, 23]
)]


# Year-month of each row: build a zero-padded "YYYY-MM" string from `date`
# and parse it with ym().
tr[, yearmo := {
  month_2d <- str_pad(month(date), width = 2, pad = "0")
  ym(paste(year(date), month_2d, sep = "-"))
}]

# aggregate to channel level
# excluding candidate appearances
# For every (yearmo, channel) cell, take the mean of each topic_* column over
# rows with tea_party, dem and rep all equal to 0, weighted by nchar(text).
channel_weights <- tr[
  tea_party == 0 & dem == 0 & rep == 0,
  lapply(.SD, weighted.mean, w = nchar(text), na.rm = TRUE),
  by = .(yearmo, channel),
  .SDcols = grep("topic_", colnames(tr), value = TRUE)
]

# Long format: one row per (channel, yearmo, topic).
channel_weights <- melt(
  channel_weights,
  id.vars = c("channel", "yearmo"),
  variable.name = "topic",
  value.name = "weight"
)

# Display name for the channel.
channel_weights[, Channel := recode(channel, fox = "FNC", cnn = "CNN", msnbc = "MSNBC")]

# Strip the "topic_" prefix and turn the topic into a factor with display
# labels. recode_factor() orders the levels as listed here, which the figures
# rely on: levels 1-6 are the Tea Party topics, 7-12 the Republican topics.
channel_weights[, topic := recode_factor(
  sub("topic_", "", topic),
  tea_party = "Tea Party Election Prospects",
  immigration = "Immigration Enforcement",
  racism = "Racism / Political Correctness",
  islam = "Islam",
  supcourt = "Supreme Court",
  sharron_angle = "NV Senate Campaign",
  taxes = "Tax / Spending / Stimulus",
  hard_work = "Unemployment",
  legislative_process = "Legislative Process",
  house = "US House Politics",
  healthcare = "Healthcare (ACA)",
  senate = "US Senate Politics"
)]

###
### FIGURE 2 b)
###

# Monthly weight of the first six topic levels, one line per channel and one
# facet per topic. The expression is left at top level so the plot is
# displayed and becomes the "last plot" that ggsave() writes below.
channel_weights[as.integer(topic) %in% 1:6] %>%
  ggplot(aes(x = yearmo, y = weight, group = Channel, colour = Channel)) +
  geom_line() +
  facet_wrap(~ topic) +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
  # y axis fixed to [0, 0.15]
  scale_y_continuous(limits = c(0, 0.15)) +
  scale_colour_manual(
    values = c(CNN = "mediumpurple2", FNC = "darkred", MSNBC = "cornflowerblue")
  ) +
  xlab("Date") +
  ylab("Topic Weight") +
  theme(
    axis.title = element_text(size = 15),
    axis.text.x = element_text(size = 13, angle = 90, vjust = 1, hjust = 1),
    axis.text.y = element_text(size = 13),
    strip.text.x = element_text(size = 12),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 13)
  )

ggsave(paste0(plot_dir, "channel_topic_weights.pdf"), width = 10, height = 6)

# Party label for every row of `tr` (added to `tr` by reference): Democrat if
# dem == 1, else Tea Party if tea_party == 1, else "Mainstream Rep.". Rows
# that are none of dem / tea_party / rep are dropped in the next step.
tr[, Party := if_else(
  dem == 1,
  "Democrat",
  if_else(tea_party == 1, "Tea Party", "Mainstream Rep.")
)]

# For every (yearmo, Party) cell, take the mean of each topic_* column over
# candidate rows, weighted by nchar(text).
cand_weights <- tr[
  tea_party == 1 | dem == 1 | rep == 1,
  lapply(.SD, weighted.mean, w = nchar(text), na.rm = TRUE),
  by = .(yearmo, Party),
  .SDcols = grep("topic_", colnames(tr), value = TRUE)
]

# Long format: one row per (Party, yearmo, topic).
cand_weights <- melt(
  cand_weights,
  id.vars = c("Party", "yearmo"),
  variable.name = "topic",
  value.name = "weight"
)

# Strip the "topic_" prefix and turn the topic into a factor with display
# labels. recode_factor() orders the levels as listed here, which the figures
# rely on: levels 1-6 are the Tea Party topics, 7-12 the Republican topics.
cand_weights[, topic := recode_factor(
  sub("topic_", "", topic),
  tea_party = "Tea Party Election Prospects",
  immigration = "Immigration Enforcement",
  racism = "Racism / Political Correctness",
  islam = "Islam",
  supcourt = "Supreme Court",
  sharron_angle = "NV Senate Campaign",
  taxes = "Tax / Spending / Stimulus",
  hard_work = "Unemployment",
  legislative_process = "Legislative Process",
  house = "US House Politics",
  healthcare = "Healthcare (ACA)",
  senate = "US Senate Politics"
)]


###
### FIGURE 2 a)
###

# Monthly weight of the first six topic levels, one line per candidate group
# and one facet per topic. The expression is left at top level so the plot is
# displayed and becomes the "last plot" that ggsave() writes below.
cand_weights[as.integer(topic) %in% 1:6] %>%
  ggplot(aes(
    x = yearmo, y = weight,
    group = Party, colour = Party, linetype = Party
  )) +
  geom_line() +
  facet_wrap(~ topic) +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
  # y axis fixed to [0, 0.15]
  scale_y_continuous(limits = c(0, 0.15)) +
  scale_colour_manual(
    values = c(
      Democrat = "blue",
      `Mainstream Rep.` = "red",
      `Tea Party` = "brown"
    )
  ) +
  scale_linetype_manual(
    values = c(
      Democrat = 2,
      `Mainstream Rep.` = 3,
      `Tea Party` = 1
    )
  ) +
  xlab("Date") +
  ylab("Topic Weight") +
  theme(
    axis.title = element_text(size = 15),
    axis.text.x = element_text(size = 13, angle = 90, vjust = 1, hjust = 1),
    axis.text.y = element_text(size = 13),
    strip.text.x = element_text(size = 12),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 13)
  )

ggsave(paste0(plot_dir, "candidate_topic_weights.pdf"), width = 10, height = 6)


###
### FIGURE B.2.1 b)
###

# Monthly weight of topic levels 7-12, one line per channel and one facet per
# topic. The expression is left at top level so the plot is displayed and
# becomes the "last plot" that ggsave() writes below.
channel_weights[as.integer(topic) %in% 7:12] %>%
  ggplot(aes(x = yearmo, y = weight, group = Channel, colour = Channel)) +
  geom_line() +
  facet_wrap(~ topic) +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
  # y axis fixed to [0, 0.15]
  scale_y_continuous(limits = c(0, 0.15)) +
  scale_colour_manual(
    values = c(CNN = "mediumpurple2", FNC = "darkred", MSNBC = "cornflowerblue")
  ) +
  xlab("Date") +
  ylab("Topic Weight") +
  theme(
    axis.title = element_text(size = 15),
    axis.text.x = element_text(size = 13, angle = 90, vjust = 1, hjust = 1),
    axis.text.y = element_text(size = 13),
    strip.text.x = element_text(size = 12),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 13)
  )

ggsave(paste0(plot_dir, "channel_topic_weights_R.pdf"), width = 10, height = 6)

###
### FIGURE B.2.1 a)
###

# Monthly weight of topic levels 7-12, one line per candidate group and one
# facet per topic. The expression is left at top level so the plot is
# displayed and becomes the "last plot" that ggsave() writes below.
cand_weights[as.integer(topic) %in% 7:12] %>%
  ggplot(aes(
    x = yearmo, y = weight,
    group = Party, colour = Party, linetype = Party
  )) +
  geom_line() +
  facet_wrap(~ topic) +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
  # NOTE: this figure uses [0, 0.2], unlike the other topic-weight figures
  # which use [0, 0.15].
  scale_y_continuous(limits = c(0, 0.2)) +
  scale_colour_manual(
    values = c(
      Democrat = "blue",
      `Mainstream Rep.` = "red",
      `Tea Party` = "brown"
    )
  ) +
  scale_linetype_manual(
    values = c(
      Democrat = 2,
      `Mainstream Rep.` = 3,
      `Tea Party` = 1
    )
  ) +
  xlab("Date") +
  ylab("Topic Weight") +
  theme(
    axis.title = element_text(size = 15),
    axis.text.x = element_text(size = 13, angle = 90, vjust = 1, hjust = 1),
    axis.text.y = element_text(size = 13),
    strip.text.x = element_text(size = 12),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 13)
  )

ggsave(paste0(plot_dir, "candidate_topic_weights_R.pdf"), width = 10, height = 6)